Abstract:
This technical blog post is available as both an HTML file and a .qmd file hosted on GitHub Pages.
embed-resources option in the header.
format:
  html:
    embed-resources: true
Step Up Code:
# Short alias used to attach packages without their startup banners.
sh <- suppressPackageStartupMessages
sh(library(tidyverse))
sh(library(caret))
# Download the serialized data set and deserialize it from the gzip stream.
wine <- "https://github.com/cd-public/D505/raw/master/dat/pinot.rds" %>%
  url() %>%
  gzcon() %>%
  readRDS()
Calculate the probability that a Pinot comes from Burgundy given it has the word ‘fruit’ in the description.
\[ P({\rm Burgundy}~|~{\rm Fruit}) \]
## Creating Fruit Feature
# Add a logical column that is TRUE when the description contains "fruit"
# (case-sensitive pattern match, so "fruity" and "fruits" also count).
wino <- mutate(wine, fruit = str_detect(description, "fruit"))
wino
## Conditional Probability
# P(Burgundy | fruit) = P(Burgundy and fruit) / P(fruit); both probabilities
# are estimated as the share of rows in `wino` that satisfy the condition.
burgundy_and_fruit <-
  sum(wino$province == "Burgundy" & wino$fruit, na.rm = TRUE) / nrow(wino)
fruit <- sum(wino$fruit, na.rm = TRUE) / nrow(wino)
burgundy_and_fruit / fruit
## [1] 0.2196038
We train a naive Bayes classifier to predict a wine’s province using: 1. An 80-20 train-test split. 2. Three features engineered from the description. 3. 5-fold cross-validation.
We report Kappa after using the model to predict provinces in the holdout sample.
## Searching for Common Words
library(tidytext)
# Break every description into one word per row, then tally the words with
# the most frequent ones first.
wine_desc <- wine %>%
  unnest_tokens(word, description) %>%
  count(word, sort = TRUE)
wine_desc
## acidity, cherry, tart
## Creating 3 Features
# One logical flag per chosen word: TRUE when the description contains it.
wino <- mutate(
  wine,
  cherry = str_detect(description, "cherry"),
  tart = str_detect(description, "tart"),
  acidity = str_detect(description, "acidity")
)
wino
## 80-20 Split
# Fix the RNG seed first so the partition below is reproducible.
set.seed(5)
# Row indices for the training portion (p = 0.8), sampled on `province`.
wine_index <- createDataPartition(wino[["province"]], p = 0.8, list = FALSE)
# Rows named by the index form the training set; all remaining rows are the
# holdout set.
train <- wino[wine_index, ]
test <- wino[-wine_index, ]
## 5-Fold Cross Validation
# Resample with 5-fold cross-validation while fitting.
train_control <- trainControl(method = "cv", number = 5)
# Restrict the predictors to the three engineered word flags. The previous
# `province ~ .` also fed every other column of `train` (including the raw
# `description` text) into the model.
fit <- train(province ~ cherry + tart + acidity,
             data = train,
             method = "naive_bayes",
             metric = "Kappa",
             trControl = train_control)
## Kappa on the Holdout Sample
# Predict provinces for the held-out rows and compare them with the truth.
holdout_pred <- predict(fit, newdata = test)
# Align the reference levels with the predicted levels so the two factors
# passed to confusionMatrix() share one level set.
holdout_cm <- confusionMatrix(
  holdout_pred,
  factor(test$province, levels = levels(holdout_pred))
)
holdout_cm$overall["Kappa"]
We find the three words that most distinguish New York Pinots from all other Pinots.
## Filtering for Pinots
# Keep the reviews whose description mentions "pinot" in any letter case
# (the "(?i)" prefix makes the pattern case-insensitive).
pinots <- filter(wine, str_detect(description, "(?i)pinot"))
pinots
## Finding Popular Terms for Pinots
# Build the baseline vocabulary from `pinots` rather than the unfiltered
# `wine`, so it covers the same population the New York subset is drawn from.
# Words of three characters or fewer are dropped before counting.
pinots_words <- pinots %>%
  unnest_tokens(word, description) %>%
  filter(nchar(word) > 3) %>%
  count(word, sort = TRUE)
pinots_words
# Share of all counted words taken by each word.
pinots_words_freq <- pinots_words %>%
  mutate(pinot_freq = n / sum(n))
pinots_words_freq
## Filtering for New York Pinots
# Keep only the New York rows of `pinots`.
ny_pinots <- filter(pinots, province == "New_York")
# Count words longer than three characters, most frequent first.
ny_pinot_words <- ny_pinots %>%
  unnest_tokens(word, description) %>%
  filter(nchar(word) > 3) %>%
  count(word, sort = TRUE)
ny_pinot_words
# Share of all counted New York words taken by each word.
ny_pinot_words_freq <- mutate(ny_pinot_words, ny_freq = n / sum(n))
ny_pinot_words_freq
# Line up the two frequency tables by word. Sorting the difference in
# ascending order puts the words that are relatively more common in New York
# first; words missing from either table get an NA difference and sort last.
word_freq_diff <- pinots_words_freq %>%
  full_join(ny_pinot_words_freq, by = "word") %>%
  mutate(freq_diff = pinot_freq - ny_freq) %>%
  arrange(freq_diff)
word_freq_diff